{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0069fced",
   "metadata": {},
   "source": [
    "首先加载将要使用的软件包。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2a05d0f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "# data visualization\n",
    "import seaborn as sns\n",
    "%matplotlib inline\n",
    "from matplotlib import pyplot as plt\n",
    "from matplotlib import style"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9e8a7e1c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "../dataset/600031.csv\n"
     ]
    }
   ],
   "source": [
    "# Print the path of every file found under ../dataset/ (walked recursively),\n",
    "# so the location of the input CSV can be confirmed before loading it.\n",
    "# `os` is already imported in the first code cell, so it is not re-imported here.\n",
    "for dirname, _, filenames in os.walk('../dataset/'):\n",
    "    for filename in filenames:\n",
    "        print(os.path.join(dirname, filename))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f774c46e",
   "metadata": {},
   "source": [
    "调用 pandas 的函数 read_csv() 读取 csv 文件内容到 DataFrame df_train。\n",
    "\n",
    "读取数据以后，检查 df_train 的最初几行数据，同时可以看到每一列特征值：日期、开盘、收盘、最高、最低、成交量、成交额、振幅、涨跌幅、涨跌额、换手率。其中'日期'是字符型的，其余各列都是数值型的。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4be03bd5",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>日期</th>\n",
       "      <th>开盘</th>\n",
       "      <th>收盘</th>\n",
       "      <th>最高</th>\n",
       "      <th>最低</th>\n",
       "      <th>成交量</th>\n",
       "      <th>成交额</th>\n",
       "      <th>振幅</th>\n",
       "      <th>涨跌幅</th>\n",
       "      <th>涨跌额</th>\n",
       "      <th>换手率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-11-09</td>\n",
       "      <td>27.80</td>\n",
       "      <td>27.54</td>\n",
       "      <td>28.14</td>\n",
       "      <td>26.89</td>\n",
       "      <td>856249</td>\n",
       "      <td>2348800992</td>\n",
       "      <td>4.55</td>\n",
       "      <td>0.18</td>\n",
       "      <td>0.05</td>\n",
       "      <td>1.01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-11-10</td>\n",
       "      <td>27.98</td>\n",
       "      <td>27.32</td>\n",
       "      <td>27.98</td>\n",
       "      <td>26.91</td>\n",
       "      <td>558824</td>\n",
       "      <td>1523209504</td>\n",
       "      <td>3.89</td>\n",
       "      <td>-0.80</td>\n",
       "      <td>-0.22</td>\n",
       "      <td>0.66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-11-11</td>\n",
       "      <td>27.33</td>\n",
       "      <td>28.31</td>\n",
       "      <td>28.86</td>\n",
       "      <td>27.21</td>\n",
       "      <td>1015374</td>\n",
       "      <td>2873205488</td>\n",
       "      <td>6.04</td>\n",
       "      <td>3.62</td>\n",
       "      <td>0.99</td>\n",
       "      <td>1.20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-11-12</td>\n",
       "      <td>28.31</td>\n",
       "      <td>28.70</td>\n",
       "      <td>28.88</td>\n",
       "      <td>27.91</td>\n",
       "      <td>718595</td>\n",
       "      <td>2041558432</td>\n",
       "      <td>3.43</td>\n",
       "      <td>1.38</td>\n",
       "      <td>0.39</td>\n",
       "      <td>0.85</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-11-13</td>\n",
       "      <td>28.28</td>\n",
       "      <td>28.16</td>\n",
       "      <td>28.50</td>\n",
       "      <td>27.60</td>\n",
       "      <td>674807</td>\n",
       "      <td>1888773520</td>\n",
       "      <td>3.14</td>\n",
       "      <td>-1.88</td>\n",
       "      <td>-0.54</td>\n",
       "      <td>0.80</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           日期     开盘     收盘     最高     最低      成交量         成交额    振幅   涨跌幅  \\\n",
       "0  2020-11-09  27.80  27.54  28.14  26.89   856249  2348800992  4.55  0.18   \n",
       "1  2020-11-10  27.98  27.32  27.98  26.91   558824  1523209504  3.89 -0.80   \n",
       "2  2020-11-11  27.33  28.31  28.86  27.21  1015374  2873205488  6.04  3.62   \n",
       "3  2020-11-12  28.31  28.70  28.88  27.91   718595  2041558432  3.43  1.38   \n",
       "4  2020-11-13  28.28  28.16  28.50  27.60   674807  1888773520  3.14 -1.88   \n",
       "\n",
       "    涨跌额   换手率  \n",
       "0  0.05  1.01  \n",
       "1 -0.22  0.66  \n",
       "2  0.99  1.20  \n",
       "3  0.39  0.85  \n",
       "4 -0.54  0.80  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load data\n",
    "df_train = pd.read_csv('../dataset/600031.csv')\n",
    "# Show first lines of data\n",
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "425dcf0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train.set_index('日期',inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6330b254",
   "metadata": {},
   "source": [
    "# 数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cca2a24f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\ndef mean_handle(total_list, cycle):\\n            return [total_list[i] / np.mean(total_list[i:i + cycle]) for i in range(len(total_list))]\\n'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 经实验并不合适”\n",
    "\"\"\"\n",
    "def mean_handle(total_list, cycle):\n",
    "            return [total_list[i] / np.mean(total_list[i:i + cycle]) for i in range(len(total_list))]\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "80557ab2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def mean_handle(total_list, cycle):\n",
    "    \"\"\"Return the forward-looking moving average of a sequence.\n",
    "\n",
    "    Element ``i`` of the result is ``np.mean(total_list[i:i + cycle])``.\n",
    "    Near the end of the sequence the slice holds fewer than ``cycle``\n",
    "    values, so the last windows average over fewer points.\n",
    "\n",
    "    Args:\n",
    "        total_list: Sliceable sequence of numbers (callers pass a list).\n",
    "        cycle: Window length, counted in elements.\n",
    "\n",
    "    Returns:\n",
    "        A list with one window mean per input element.\n",
    "    \"\"\"\n",
    "    averages = []\n",
    "    for start in range(len(total_list)):\n",
    "        window = total_list[start:start + cycle]\n",
    "        averages.append(np.mean(window))\n",
    "    return averages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "14211dba",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rate(total_list, cycle):\n",
    "    \"\"\"Return the gap between each forward-window mean and the current value.\n",
    "\n",
    "    Element ``i`` of the result is\n",
    "    ``np.mean(total_list[i:i + cycle]) - total_list[i]``, so it is positive\n",
    "    when the mean of the window starting at ``i`` lies above the value at ``i``.\n",
    "    Windows near the end of the sequence contain fewer than ``cycle`` values.\n",
    "\n",
    "    Args:\n",
    "        total_list: Sliceable sequence of numbers (callers pass a list).\n",
    "        cycle: Window length, counted in elements.\n",
    "\n",
    "    Returns:\n",
    "        A list with one difference per input element.\n",
    "    \"\"\"\n",
    "    gaps = []\n",
    "    for idx, current in enumerate(total_list):\n",
    "        forward_mean = np.mean(total_list[idx:idx + cycle])\n",
    "        gaps.append(forward_mean - current)\n",
    "    return gaps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c301d5c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def md(total_list, cycle):\n",
    "    \"\"\"Return the forward-looking moving standard deviation of a sequence.\n",
    "\n",
    "    Element ``i`` of the result is ``np.std(total_list[i:i + cycle])``, i.e.\n",
    "    ``np.std`` called with its default arguments on the window starting at\n",
    "    ``i``. Windows near the end of the sequence contain fewer than ``cycle``\n",
    "    values.\n",
    "\n",
    "    Args:\n",
    "        total_list: Sliceable sequence of numbers (callers pass a list).\n",
    "        cycle: Window length, counted in elements.\n",
    "\n",
    "    Returns:\n",
    "        A list with one window standard deviation per input element.\n",
    "    \"\"\"\n",
    "    def window_std(start):\n",
    "        return np.std(total_list[start:start + cycle])\n",
    "\n",
    "    return [window_std(start) for start in range(len(total_list))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2e28896d",
   "metadata": {},
   "outputs": [],
   "source": [
    "cycle = 15"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4789ef2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrite every column of df_train with its forward-looking moving average\n",
    "# over `cycle` rows. The frame is modified in place, so re-running this cell\n",
    "# smooths the already-smoothed values again.\n",
    "for column in list(df_train.columns):\n",
    "    smoothed = mean_handle(df_train[column].tolist(), cycle)\n",
    "    df_train[column] = smoothed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "819b9187",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train['rate'] = rate(df_train['收盘'].tolist(), cycle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "dd0b45ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train['md'] = md(df_train['收盘'].tolist(), cycle)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0ce7431b",
   "metadata": {},
   "source": [
    "# 作差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "bb8fb773",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace every column of df_train with its row-to-row change.\n",
    "# The frame is overwritten in place, so re-running this cell differences the\n",
    "# already-differenced values again.\n",
    "for column in df_train.columns:\n",
    "    # Convert the column to a list once; calling .tolist() for every element\n",
    "    # access would make this loop quadratic in the number of rows.\n",
    "    values = df_train[column].tolist()\n",
    "    # Row 0 has no predecessor, so it is seeded with values[0] - values[cycle].\n",
    "    # This needs len(values) > cycle and raises IndexError otherwise.\n",
    "    datalist = [values[0] - values[cycle]]\n",
    "    datalist.extend(values[i] - values[i - 1] for i in range(1, len(values)))\n",
    "    df_train[column] = datalist"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76e2c40c",
   "metadata": {},
   "source": [
    "# 数据检查"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f84c04fe",
   "metadata": {},
   "source": [
    "df_train.shape 显示数据的行数和列数；df_train.info() 显示数据的基本信息，包括每一列特征值的名称，行数，数值类型，和占用的内存空间。\n",
    "\n",
    "df_train.describe() 显示数据每一列的统计信息：平均值，标准差，最大值，最小值，分位数，等。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "dbd43b9b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>开盘</th>\n",
       "      <th>收盘</th>\n",
       "      <th>最高</th>\n",
       "      <th>最低</th>\n",
       "      <th>成交量</th>\n",
       "      <th>成交额</th>\n",
       "      <th>振幅</th>\n",
       "      <th>涨跌幅</th>\n",
       "      <th>涨跌额</th>\n",
       "      <th>换手率</th>\n",
       "      <th>rate</th>\n",
       "      <th>md</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>日期</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2020-11-09</th>\n",
       "      <td>-3.052000</td>\n",
       "      <td>-2.904000</td>\n",
       "      <td>-2.967333</td>\n",
       "      <td>-3.043333</td>\n",
       "      <td>17325.666667</td>\n",
       "      <td>-145895276.8</td>\n",
       "      <td>0.620667</td>\n",
       "      <td>0.740000</td>\n",
       "      <td>0.211333</td>\n",
       "      <td>0.021333</td>\n",
       "      <td>0.778622</td>\n",
       "      <td>-0.039870</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2020-11-10</th>\n",
       "      <td>0.218667</td>\n",
       "      <td>0.211333</td>\n",
       "      <td>0.236667</td>\n",
       "      <td>0.248667</td>\n",
       "      <td>8752.800000</td>\n",
       "      <td>48122080.0</td>\n",
       "      <td>-0.074000</td>\n",
       "      <td>-0.098000</td>\n",
       "      <td>-0.030000</td>\n",
       "      <td>0.010667</td>\n",
       "      <td>-0.017733</td>\n",
       "      <td>-0.053689</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2020-11-11</th>\n",
       "      <td>0.190000</td>\n",
       "      <td>0.327333</td>\n",
       "      <td>0.283333</td>\n",
       "      <td>0.254000</td>\n",
       "      <td>24143.733333</td>\n",
       "      <td>92502323.2</td>\n",
       "      <td>0.068667</td>\n",
       "      <td>0.383333</td>\n",
       "      <td>0.116000</td>\n",
       "      <td>0.028667</td>\n",
       "      <td>-0.146533</td>\n",
       "      <td>-0.070238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2020-11-12</th>\n",
       "      <td>0.326667</td>\n",
       "      <td>0.272000</td>\n",
       "      <td>0.265333</td>\n",
       "      <td>0.304667</td>\n",
       "      <td>-9884.800000</td>\n",
       "      <td>-4604057.6</td>\n",
       "      <td>-0.183333</td>\n",
       "      <td>-0.208000</td>\n",
       "      <td>-0.055333</td>\n",
       "      <td>-0.012000</td>\n",
       "      <td>-0.117156</td>\n",
       "      <td>-0.073551</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2020-11-13</th>\n",
       "      <td>0.247333</td>\n",
       "      <td>0.197333</td>\n",
       "      <td>0.222667</td>\n",
       "      <td>0.225333</td>\n",
       "      <td>-3179.533333</td>\n",
       "      <td>5604700.8</td>\n",
       "      <td>-0.037333</td>\n",
       "      <td>-0.242000</td>\n",
       "      <td>-0.074667</td>\n",
       "      <td>-0.004000</td>\n",
       "      <td>-0.060044</td>\n",
       "      <td>-0.073739</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  开盘        收盘        最高        最低           成交量          成交额  \\\n",
       "日期                                                                              \n",
       "2020-11-09 -3.052000 -2.904000 -2.967333 -3.043333  17325.666667 -145895276.8   \n",
       "2020-11-10  0.218667  0.211333  0.236667  0.248667   8752.800000   48122080.0   \n",
       "2020-11-11  0.190000  0.327333  0.283333  0.254000  24143.733333   92502323.2   \n",
       "2020-11-12  0.326667  0.272000  0.265333  0.304667  -9884.800000   -4604057.6   \n",
       "2020-11-13  0.247333  0.197333  0.222667  0.225333  -3179.533333    5604700.8   \n",
       "\n",
       "                  振幅       涨跌幅       涨跌额       换手率      rate        md  \n",
       "日期                                                                      \n",
       "2020-11-09  0.620667  0.740000  0.211333  0.021333  0.778622 -0.039870  \n",
       "2020-11-10 -0.074000 -0.098000 -0.030000  0.010667 -0.017733 -0.053689  \n",
       "2020-11-11  0.068667  0.383333  0.116000  0.028667 -0.146533 -0.070238  \n",
       "2020-11-12 -0.183333 -0.208000 -0.055333 -0.012000 -0.117156 -0.073551  \n",
       "2020-11-13 -0.037333 -0.242000 -0.074667 -0.004000 -0.060044 -0.073739  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "975cf5e1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(252, 12)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "15f3fdc8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 252 entries, 2020-11-09 to 2021-11-19\n",
      "Data columns (total 12 columns):\n",
      " #   Column  Non-Null Count  Dtype  \n",
      "---  ------  --------------  -----  \n",
      " 0   开盘      252 non-null    float64\n",
      " 1   收盘      252 non-null    float64\n",
      " 2   最高      252 non-null    float64\n",
      " 3   最低      252 non-null    float64\n",
      " 4   成交量     252 non-null    float64\n",
      " 5   成交额     252 non-null    float64\n",
      " 6   振幅      252 non-null    float64\n",
      " 7   涨跌幅     252 non-null    float64\n",
      " 8   涨跌额     252 non-null    float64\n",
      " 9   换手率     252 non-null    float64\n",
      " 10  rate    252 non-null    float64\n",
      " 11  md      252 non-null    float64\n",
      "dtypes: float64(12)\n",
      "memory usage: 25.6+ KB\n"
     ]
    }
   ],
   "source": [
    "df_train.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b85f8e76",
   "metadata": {},
   "source": [
    "# **缺失数据的处理**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "d87cd4e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_missing_data(df):\n",
    "    \"\"\"Summarise the missing values in each column of a DataFrame.\n",
    "\n",
    "    Args:\n",
    "        df: The pandas DataFrame to inspect.\n",
    "\n",
    "    Returns:\n",
    "        A DataFrame indexed by the column names of ``df`` with two columns:\n",
    "        'Total' (number of null entries) and 'Percent' (share of null\n",
    "        entries in percent, rounded to 2 decimals), ordered by 'Total'\n",
    "        from largest to smallest.\n",
    "    \"\"\"\n",
    "    # Count and sort once, then derive the percentage from that same Series so\n",
    "    # both output columns are guaranteed to share one row order.\n",
    "    total = df.isnull().sum().sort_values(ascending=False)\n",
    "    percent = (total * 100 / len(df)).round(2)\n",
    "    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2e8eca89",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Total</th>\n",
       "      <th>Percent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>开盘</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>收盘</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>最高</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>最低</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>成交量</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>成交额</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>振幅</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>涨跌幅</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>涨跌额</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>换手率</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rate</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>md</th>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Total  Percent\n",
       "开盘        0      0.0\n",
       "收盘        0      0.0\n",
       "最高        0      0.0\n",
       "最低        0      0.0\n",
       "成交量       0      0.0\n",
       "成交额       0      0.0\n",
       "振幅        0      0.0\n",
       "涨跌幅       0      0.0\n",
       "涨跌额       0      0.0\n",
       "换手率       0      0.0\n",
       "rate      0      0.0\n",
       "md        0      0.0"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "check_missing_data(df_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c41332a7",
   "metadata": {},
   "source": [
    "一个简单的处理缺失数据的方式，就是用0和1填补 'something'的缺失数据。因为something本身的字符没有意义，所以在 something缺失的位置填0，并且用1替代something的原有非空字符。"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e0e9460e",
   "metadata": {},
   "source": [
    "此外：\n",
    "* 还可以用对应列的平均值填补缺失的数据。\n",
    "\n",
    "    `#complete missing age with mean\n",
    "    data['Age'].fillna(data['Age'].mean(), inplace = True)`\n",
    "\n",
    "* 用相应列中出现最多的字符来代替缺失的值，这通过 dataframe 的函数 data['Embarked'].mode() 来实现。\n",
    "\n",
    "    `#complete missing Embarked with Mode\n",
    "    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace = True)`"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02dfcaaf",
   "metadata": {},
   "source": [
    "# **数据清理**"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7a8ee65f",
   "metadata": {},
   "source": [
    "* 检查冗余信息：相关性是检查冗余信息的有效方法。当某两列的相关系数非常高（接近1.0）的时候，我们认为这两列的信息是冗余的。以下的相关系数矩阵显示这个例子中存在冗余信息：例如'成交量'与'换手率'的相关系数约为0.99999，'开盘'、'收盘'、'最高'、'最低'两两之间的相关系数都在0.95以上。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "61e8c58c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>开盘</th>\n",
       "      <th>收盘</th>\n",
       "      <th>最高</th>\n",
       "      <th>最低</th>\n",
       "      <th>成交量</th>\n",
       "      <th>成交额</th>\n",
       "      <th>振幅</th>\n",
       "      <th>涨跌幅</th>\n",
       "      <th>涨跌额</th>\n",
       "      <th>换手率</th>\n",
       "      <th>rate</th>\n",
       "      <th>md</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>开盘</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.952445</td>\n",
       "      <td>0.979336</td>\n",
       "      <td>0.984174</td>\n",
       "      <td>0.169610</td>\n",
       "      <td>0.380619</td>\n",
       "      <td>0.095315</td>\n",
       "      <td>-0.218084</td>\n",
       "      <td>-0.208078</td>\n",
       "      <td>0.168923</td>\n",
       "      <td>-0.604834</td>\n",
       "      <td>0.013612</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>收盘</th>\n",
       "      <td>0.952445</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.983576</td>\n",
       "      <td>0.977298</td>\n",
       "      <td>0.223783</td>\n",
       "      <td>0.424309</td>\n",
       "      <td>0.177599</td>\n",
       "      <td>0.064763</td>\n",
       "      <td>0.081368</td>\n",
       "      <td>0.223228</td>\n",
       "      <td>-0.629480</td>\n",
       "      <td>-0.010546</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>最高</th>\n",
       "      <td>0.979336</td>\n",
       "      <td>0.983576</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.979857</td>\n",
       "      <td>0.259323</td>\n",
       "      <td>0.470285</td>\n",
       "      <td>0.230041</td>\n",
       "      <td>-0.054147</td>\n",
       "      <td>-0.047245</td>\n",
       "      <td>0.258750</td>\n",
       "      <td>-0.629179</td>\n",
       "      <td>-0.014120</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>最低</th>\n",
       "      <td>0.984174</td>\n",
       "      <td>0.977298</td>\n",
       "      <td>0.979857</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.133903</td>\n",
       "      <td>0.340743</td>\n",
       "      <td>0.036743</td>\n",
       "      <td>-0.104975</td>\n",
       "      <td>-0.085696</td>\n",
       "      <td>0.133167</td>\n",
       "      <td>-0.606272</td>\n",
       "      <td>0.010978</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>成交量</th>\n",
       "      <td>0.169610</td>\n",
       "      <td>0.223783</td>\n",
       "      <td>0.259323</td>\n",
       "      <td>0.133903</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.959506</td>\n",
       "      <td>0.723384</td>\n",
       "      <td>0.244660</td>\n",
       "      <td>0.165332</td>\n",
       "      <td>0.999989</td>\n",
       "      <td>-0.096415</td>\n",
       "      <td>-0.083808</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>成交额</th>\n",
       "      <td>0.380619</td>\n",
       "      <td>0.424309</td>\n",
       "      <td>0.470285</td>\n",
       "      <td>0.340743</td>\n",
       "      <td>0.959506</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.737848</td>\n",
       "      <td>0.192398</td>\n",
       "      <td>0.124535</td>\n",
       "      <td>0.959433</td>\n",
       "      <td>-0.260780</td>\n",
       "      <td>-0.093909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>振幅</th>\n",
       "      <td>0.095315</td>\n",
       "      <td>0.177599</td>\n",
       "      <td>0.230041</td>\n",
       "      <td>0.036743</td>\n",
       "      <td>0.723384</td>\n",
       "      <td>0.737848</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.348179</td>\n",
       "      <td>0.274808</td>\n",
       "      <td>0.723996</td>\n",
       "      <td>-0.162753</td>\n",
       "      <td>-0.108055</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>涨跌幅</th>\n",
       "      <td>-0.218084</td>\n",
       "      <td>0.064763</td>\n",
       "      <td>-0.054147</td>\n",
       "      <td>-0.104975</td>\n",
       "      <td>0.244660</td>\n",
       "      <td>0.192398</td>\n",
       "      <td>0.348179</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.976770</td>\n",
       "      <td>0.244895</td>\n",
       "      <td>-0.030371</td>\n",
       "      <td>-0.017659</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>涨跌额</th>\n",
       "      <td>-0.208078</td>\n",
       "      <td>0.081368</td>\n",
       "      <td>-0.047245</td>\n",
       "      <td>-0.085696</td>\n",
       "      <td>0.165332</td>\n",
       "      <td>0.124535</td>\n",
       "      <td>0.274808</td>\n",
       "      <td>0.976770</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.165566</td>\n",
       "      <td>-0.039074</td>\n",
       "      <td>-0.042727</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>换手率</th>\n",
       "      <td>0.168923</td>\n",
       "      <td>0.223228</td>\n",
       "      <td>0.258750</td>\n",
       "      <td>0.133167</td>\n",
       "      <td>0.999989</td>\n",
       "      <td>0.959433</td>\n",
       "      <td>0.723996</td>\n",
       "      <td>0.244895</td>\n",
       "      <td>0.165566</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.096472</td>\n",
       "      <td>-0.084485</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rate</th>\n",
       "      <td>-0.604834</td>\n",
       "      <td>-0.629480</td>\n",
       "      <td>-0.629179</td>\n",
       "      <td>-0.606272</td>\n",
       "      <td>-0.096415</td>\n",
       "      <td>-0.260780</td>\n",
       "      <td>-0.162753</td>\n",
       "      <td>-0.030371</td>\n",
       "      <td>-0.039074</td>\n",
       "      <td>-0.096472</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.057477</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>md</th>\n",
       "      <td>0.013612</td>\n",
       "      <td>-0.010546</td>\n",
       "      <td>-0.014120</td>\n",
       "      <td>0.010978</td>\n",
       "      <td>-0.083808</td>\n",
       "      <td>-0.093909</td>\n",
       "      <td>-0.108055</td>\n",
       "      <td>-0.017659</td>\n",
       "      <td>-0.042727</td>\n",
       "      <td>-0.084485</td>\n",
       "      <td>0.057477</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            开盘        收盘        最高        最低       成交量       成交额        振幅  \\\n",
       "开盘    1.000000  0.952445  0.979336  0.984174  0.169610  0.380619  0.095315   \n",
       "收盘    0.952445  1.000000  0.983576  0.977298  0.223783  0.424309  0.177599   \n",
       "最高    0.979336  0.983576  1.000000  0.979857  0.259323  0.470285  0.230041   \n",
       "最低    0.984174  0.977298  0.979857  1.000000  0.133903  0.340743  0.036743   \n",
       "成交量   0.169610  0.223783  0.259323  0.133903  1.000000  0.959506  0.723384   \n",
       "成交额   0.380619  0.424309  0.470285  0.340743  0.959506  1.000000  0.737848   \n",
       "振幅    0.095315  0.177599  0.230041  0.036743  0.723384  0.737848  1.000000   \n",
       "涨跌幅  -0.218084  0.064763 -0.054147 -0.104975  0.244660  0.192398  0.348179   \n",
       "涨跌额  -0.208078  0.081368 -0.047245 -0.085696  0.165332  0.124535  0.274808   \n",
       "换手率   0.168923  0.223228  0.258750  0.133167  0.999989  0.959433  0.723996   \n",
       "rate -0.604834 -0.629480 -0.629179 -0.606272 -0.096415 -0.260780 -0.162753   \n",
       "md    0.013612 -0.010546 -0.014120  0.010978 -0.083808 -0.093909 -0.108055   \n",
       "\n",
       "           涨跌幅       涨跌额       换手率      rate        md  \n",
       "开盘   -0.218084 -0.208078  0.168923 -0.604834  0.013612  \n",
       "收盘    0.064763  0.081368  0.223228 -0.629480 -0.010546  \n",
       "最高   -0.054147 -0.047245  0.258750 -0.629179 -0.014120  \n",
       "最低   -0.104975 -0.085696  0.133167 -0.606272  0.010978  \n",
       "成交量   0.244660  0.165332  0.999989 -0.096415 -0.083808  \n",
       "成交额   0.192398  0.124535  0.959433 -0.260780 -0.093909  \n",
       "振幅    0.348179  0.274808  0.723996 -0.162753 -0.108055  \n",
       "涨跌幅   1.000000  0.976770  0.244895 -0.030371 -0.017659  \n",
       "涨跌额   0.976770  1.000000  0.165566 -0.039074 -0.042727  \n",
       "换手率   0.244895  0.165566  1.000000 -0.096472 -0.084485  \n",
       "rate -0.030371 -0.039074 -0.096472  1.000000  0.057477  \n",
       "md   -0.017659 -0.042727 -0.084485  0.057477  1.000000  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.corr()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c2162277",
   "metadata": {},
   "source": [
    "* 标准差用于检测无用信息。当某一列的标准差接近0.0时，这一列的所有值几乎是相同的，也就是提供了无用信息。这样的列可以被删除。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "34d2e227",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "开盘      3.168535e-01\n",
       "收盘      3.100984e-01\n",
       "最高      3.275610e-01\n",
       "最低      3.014797e-01\n",
       "成交量     5.482656e+04\n",
       "成交额     1.775729e+08\n",
       "振幅      1.886228e-01\n",
       "涨跌幅     3.077441e-01\n",
       "涨跌额     1.024433e-01\n",
       "换手率     6.459324e-02\n",
       "rate    1.915776e-01\n",
       "md      8.504177e-02\n",
       "dtype: float64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.std()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c5244915",
   "metadata": {},
   "source": [
    "收盘       0.060372\n",
    "最高       0.063221\n",
    "最低       0.058446\n",
    "rate     0.051277\n",
    "md       0.016265\n",
    "都十分接近于0"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8300a75f",
   "metadata": {},
   "source": [
    "# **非数值变量的处理**"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e3804d4e",
   "metadata": {},
   "source": [
    "对于字符串类型的变量，我们需要将其转化为数值变量。以下的示例代码中，我们将用0取代'male'，用1取代'female'。"
   ]
  },
  {
   "cell_type": "raw",
   "id": "9d0d7d6b",
   "metadata": {},
   "source": [
    "# Forexample:\n",
    "###  Convert ‘Sex’ feature into numeric.\n",
    "genders = {\"male\": 0, \"female\": 1}\n",
    "all_data = [df_train]\n",
    "\n",
    "for dataset in all_data:\n",
    "    dataset['Sex'] = dataset['Sex'].map(genders)\n",
    "df_train['Sex'].value_counts()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "deep",
   "language": "python",
   "name": "deep"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
